In [56]:
import polars as pl
import numpy as np
import plotly.express as px
import plotly.io as pio
# render Plotly figures inline in the notebook
pio.renderers.default = 'notebook'
# scikit-learn: train/test splitting and feature scaling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# PyTorch: model definition, loss, and data loading
import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, TensorDataset
Exercise to use polars with scikit-learn, Pytorch, Plotly¶
- using residual sugar as my label instead of quality, just for fun
- not an exercise to create the best model - just for polars integration
In [57]:
## Importing the dataset
# UCI red-wine-quality dataset; semicolon-separated CSV.
# infer_schema_length is raised so dtypes are inferred from the first 10,000 rows.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
raw_data = pl.read_csv(url, separator=';', infer_schema_length=10000)
# quick visual sanity check of the first rows
raw_data.head()
Out[57]:
shape: (5, 12)
| fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|
| f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | i64 |
| 7.4 | 0.7 | 0.0 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
| 7.8 | 0.88 | 0.0 | 2.6 | 0.098 | 25.0 | 67.0 | 0.9968 | 3.2 | 0.68 | 9.8 | 5 |
| 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15.0 | 54.0 | 0.997 | 3.26 | 0.65 | 9.8 | 5 |
| 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17.0 | 60.0 | 0.998 | 3.16 | 0.58 | 9.8 | 6 |
| 7.4 | 0.7 | 0.0 | 1.9 | 0.076 | 11.0 | 34.0 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
In [58]:
raw_data.schema
Out[58]:
Schema([('fixed acidity', Float64),
('volatile acidity', Float64),
('citric acid', Float64),
('residual sugar', Float64),
('chlorides', Float64),
('free sulfur dioxide', Float64),
('total sulfur dioxide', Float64),
('density', Float64),
('pH', Float64),
('sulphates', Float64),
('alcohol', Float64),
('quality', Int64)])
In [59]:
# summary statistics (count, nulls, mean, std, min/max, quartiles) for every column
raw_data.describe()
Out[59]:
shape: (9, 13)
| statistic | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| str | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| "count" | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 | 1599.0 |
| "null_count" | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| "mean" | 8.319637 | 0.527821 | 0.270976 | 2.538806 | 0.087467 | 15.874922 | 46.467792 | 0.996747 | 3.311113 | 0.658149 | 10.422983 | 5.636023 |
| "std" | 1.741096 | 0.17906 | 0.194801 | 1.409928 | 0.047065 | 10.460157 | 32.895324 | 0.001887 | 0.154386 | 0.169507 | 1.065668 | 0.807569 |
| "min" | 4.6 | 0.12 | 0.0 | 0.9 | 0.012 | 1.0 | 6.0 | 0.99007 | 2.74 | 0.33 | 8.4 | 3.0 |
| "25%" | 7.1 | 0.39 | 0.09 | 1.9 | 0.07 | 7.0 | 22.0 | 0.9956 | 3.21 | 0.55 | 9.5 | 5.0 |
| "50%" | 7.9 | 0.52 | 0.26 | 2.2 | 0.079 | 14.0 | 38.0 | 0.99675 | 3.31 | 0.62 | 10.2 | 6.0 |
| "75%" | 9.2 | 0.64 | 0.42 | 2.6 | 0.09 | 21.0 | 62.0 | 0.99784 | 3.4 | 0.73 | 11.1 | 6.0 |
| "max" | 15.9 | 1.58 | 1.0 | 15.5 | 0.611 | 72.0 | 289.0 | 1.00369 | 4.01 | 2.0 | 14.9 | 8.0 |
In [60]:
from polars import selectors as cs

# Names of the Float64 columns, excluding the prospective target column.
# (drop first, then select by dtype — same result as select-then-drop,
# since 'residual sugar' is itself Float64)
raw_data.drop('residual sugar').select(cs.by_dtype(pl.Float64)).columns
Out[60]:
['fixed acidity', 'volatile acidity', 'citric acid', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol']
In [61]:
# Normalize column names: replace spaces with underscores so columns can be
# referenced without quoting (e.g. 'residual sugar' -> 'residual_sugar')
name_map = {old: old.replace(" ", "_") for old in raw_data.columns}
raw_data = raw_data.rename(name_map)
In [62]:
# describe residual_sugar: could we bin it into a reasonably balanced binary label?
raw_data['residual_sugar'].describe()
Out[62]:
shape: (9, 2)
| statistic | value |
|---|---|
| str | f64 |
| "count" | 1599.0 |
| "null_count" | 0.0 |
| "mean" | 2.538806 |
| "std" | 1.409928 |
| "min" | 0.9 |
| "25%" | 1.9 |
| "50%" | 2.2 |
| "75%" | 2.6 |
| "max" | 15.5 |
In [63]:
# Histogram of residual_sugar. The distribution is strongly right-skewed:
# the 75th percentile is 2.6 g/L while the max is 15.5 g/L (see describe() above).
fig = px.histogram(raw_data, x='residual_sugar', nbins=10)
fig.update_layout(template='ggplot2', width=600, bargap=0.2)
fig.show()
In [64]:
# Candidate binary label: 'High' when residual_sugar > 4, otherwise 'Low'
residual_sugar_cat = (
    raw_data
    .select(pl.when(pl.col('residual_sugar') > 4).then(pl.lit('High')).otherwise(pl.lit('Low')))
    .to_series()
    .to_list()
)
# Pairwise scatter matrix of all numeric features (target excluded),
# colored by the High/Low sugar bin to look for separable structure
fig = px.scatter_matrix(
    raw_data,
    dimensions=raw_data.select(cs.by_dtype(pl.Float64)).drop('residual_sugar').columns,
    color=residual_sugar_cat
)
fig.update_traces(marker=dict(size=3, opacity=0.8, line=dict(width=0.2, color='darkslategrey')))
fig.update_layout(template='ggplot2', width=1000, height=1000, font_size=8)
In [65]:
# residual sugar vs. quality — no visible relationship, which is why the
# "High"/"Low" classification idea is abandoned below
fig = px.scatter(
    raw_data,
    y='residual_sugar',
    x='quality'
)
fig.update_layout(template='ggplot2', width=500)
Interpretation¶
- initially I wanted to bin the residual sugar so I could predict "High" or "Low" residual sugar. However, the exploration shows no obvious relationship, so I dropped that idea.
Prepare the data¶
In [66]:
# NOTE(review): the scaler is fit on the FULL dataset here, before the
# train/test split further down — training statistics leak into the test set,
# and 'quality' is scaled along with the features. Acceptable for this
# polars-integration exercise, but fit on the training split only in real work.
scaler = StandardScaler()
# drop the target so only the remaining columns are standardized
features = raw_data.drop('residual_sugar')
# z-score every remaining column (returns a numpy array)
scaled_features = scaler.fit_transform(features)
# rebuild a polars DataFrame, reusing the original column names as the schema
scaled_data = pl.DataFrame(scaled_features, schema=features.columns)
# re-attach the unscaled target column
scaled_data = scaled_data.with_columns(raw_data['residual_sugar'])
In [67]:
# inspect the scaled data: features are z-scored, residual_sugar stays in original units
scaled_data
Out[67]:
shape: (1_599, 12)
| fixed_acidity | volatile_acidity | citric_acid | chlorides | free_sulfur_dioxide | total_sulfur_dioxide | density | pH | sulphates | alcohol | quality | residual_sugar |
|---|---|---|---|---|---|---|---|---|---|---|---|
| f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 | f64 |
| -0.52836 | 0.961877 | -1.391472 | -0.243707 | -0.466193 | -0.379133 | 0.558274 | 1.288643 | -0.579207 | -0.960246 | -0.787823 | 1.9 |
| -0.298547 | 1.967442 | -1.391472 | 0.223875 | 0.872638 | 0.624363 | 0.028261 | -0.719933 | 0.12895 | -0.584777 | -0.787823 | 2.6 |
| -0.298547 | 1.297065 | -1.18607 | 0.096353 | -0.083669 | 0.229047 | 0.134264 | -0.331177 | -0.048089 | -0.584777 | -0.787823 | 2.3 |
| 1.654856 | -1.384443 | 1.484154 | -0.26496 | 0.107592 | 0.4115 | 0.664277 | -0.979104 | -0.46118 | -0.584777 | 0.450848 | 1.9 |
| -0.52836 | 0.961877 | -1.391472 | -0.243707 | -0.466193 | -0.379133 | 0.558274 | 1.288643 | -0.579207 | -0.960246 | -0.787823 | 1.9 |
| … | … | … | … | … | … | … | … | … | … | … | … |
| -1.217796 | 0.403229 | -0.980669 | 0.053845 | 1.542054 | -0.075043 | -0.978765 | 0.899886 | -0.46118 | 0.072294 | -0.787823 | 2.0 |
| -1.390155 | 0.123905 | -0.877968 | -0.541259 | 2.211469 | 0.13782 | -0.862162 | 1.353436 | 0.601055 | 0.729364 | 0.450848 | 2.2 |
| -1.160343 | -0.099554 | -0.723916 | -0.243707 | 1.255161 | -0.196679 | -0.533554 | 0.705508 | 0.542042 | 0.54163 | 0.450848 | 2.3 |
| -1.390155 | 0.65462 | -0.775267 | -0.26496 | 1.542054 | -0.075043 | -0.676657 | 1.6774 | 0.30599 | -0.209308 | -0.787823 | 2.0 |
| -1.332702 | -1.216849 | 1.021999 | -0.43499 | 0.203223 | -0.135861 | -0.666057 | 0.51113 | 0.010924 | 0.54163 | 0.450848 | 3.6 |
In [68]:
# Correlation heatmap of all scaled columns.
corr_labels = scaled_data.columns
# NOTE(review): only the y-axis labels are passed explicitly; consider adding
# x=corr_labels as well so both axes are guaranteed to show column names.
fig = px.imshow(
    scaled_data.corr(),
    y=corr_labels,
    text_auto='.1f',  # print each correlation to one decimal place
    zmin=-1,
    zmax=1,
    color_continuous_scale='RdBu'
)
fig.update_xaxes(side='top')
fig.update_layout(template='ggplot2', width=700, height=700)
In [69]:
# features: every scaled column except the target, as a float32 tensor
dataT = torch.tensor(scaled_data.drop('residual_sugar').to_numpy()).float()
# target in original units; unsqueeze(1) makes it (n, 1) to match the model output
labelT = torch.tensor(scaled_data['residual_sugar'].to_numpy()).float().unsqueeze(1) # unsqueeze to convert to 2D
In [70]:
# confirm shapes: labels (n, 1), features (n, 11)
print(labelT.shape)
print(dataT.shape)
torch.Size([1599, 1]) torch.Size([1599, 11])
Partition the data and loading into DataLoader¶
In [71]:
X_train, X_test, y_train, y_test = train_test_split(dataT, labelT, test_size=0.3, random_state=42)
In [72]:
# wrap the split tensors as PyTorch datasets
train_data = TensorDataset(X_train, y_train)
test_data = TensorDataset(X_test, y_test)
In [73]:
# wrap the datasets in DataLoaders; drop_last discards the final partial
# training batch (1119 rows -> 34 full batches of 32)
train_loader = DataLoader(train_data, batch_size=32, shuffle=True, drop_last=True)
# the whole test set as one batch, so evaluation is a single forward pass
test_loader = DataLoader(test_data, batch_size=test_data.tensors[0].shape[0])
In [74]:
# shape of the training features held by the DataLoader's underlying dataset
train_loader.dataset.tensors[0].shape
Out[74]:
torch.Size([1119, 11])
In [75]:
test_loader.dataset.tensors[0].shape
Out[75]:
torch.Size([480, 11])
In [76]:
# sanity check: the split sizes should add back up to the full row count
print(raw_data.shape)
# NOTE(review): 1439 + 160 does equal 1599, but those addends don't match the
# actual 70/30 split sizes shown above (1119 + 480)
1439 + 160
(1599, 12)
Out[76]:
1599
Create the model class¶
In [77]:
class ANN(nn.Module):
def __init__(self):
super().__init__()
# create the input layer
self.input = nn.Linear(11, 16)
# create the hidden layers
self.hidden1 = nn.Linear(16, 32)
self.hidden2 = nn.Linear(32, 32)
# create the output layer
self.output = nn.Linear(32, 1)
def forward(self, x):
x = F.relu(self.input(x))
x = F.relu(self.hidden1(x))
x = F.relu(self.hidden2(x))
return self.output(x)
Create a function to train the model¶
In [78]:
def trainModel(model, train_loader, test_loader, numepochs=200, learning_rate=0.01):
    """Train `model` with SGD + MSE loss, logging per-batch performance.

    Parameters
    ----------
    model : nn.Module
        Network to train (optimized in place).
    train_loader : DataLoader
        Mini-batches used for optimization.
    test_loader : DataLoader
        Loader whose first batch is used as the held-out evaluation set
        (in this notebook it is built with a single full-size batch).
    numepochs : int
        Number of passes over the training data.
    learning_rate : float
        SGD learning rate.

    Returns
    -------
    pl.DataFrame
        One row per training iteration with columns
        iteration / epoch / batch_train_loss / batch_test_loss.
    """
    # schema pins the output dtypes (also covers the numepochs == 0 case)
    schema = {
        'iteration'       : pl.Int64,
        'epoch'           : pl.Int64,
        'batch_train_loss': pl.Float64,
        'batch_test_loss' : pl.Float64
    }
    # define the loss function and optimizer
    lossfunc = nn.MSELoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    # Accumulate plain dicts and build the DataFrame once at the end:
    # growing a DataFrame with pl.concat inside the loop is quadratic.
    records = []
    iteration = 0  # global step counter across all epochs
    for epoch in range(numepochs):
        model.train()
        for X, y in train_loader:
            yhat = model(X)
            loss = lossfunc(yhat, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # evaluate on the (single-batch) test loader at every iteration
            model.eval()
            X_test, y_test = next(iter(test_loader))
            with torch.no_grad():
                yhat_test = model(X_test)
                loss_test = lossfunc(yhat_test, y_test)
            # switch back to training mode for the next batch
            model.train()
            # record the results of this iteration
            records.append({
                'iteration'       : iteration,
                'epoch'           : epoch,
                'batch_train_loss': loss.item(),
                'batch_test_loss' : loss_test.item()
            })
            iteration += 1
    # single construction; polars expects an iterable of row dicts here
    return pl.DataFrame(records, schema=schema)
Note about dataframe creating in polars¶
- the square brackets [] around the dictionary are there because we are providing a list containing a single dictionary, where each dictionary represents one row of data. This approach is used when you want to create a DataFrame from specific rows of data
- The [{...}] syntax means "a list containing one dictionary" - that dictionary becomes one row in your DataFrame.
In [79]:
# Optional smoke test for the model class (kept commented out):
# model = ANN()
# a forward pass on random input should return shape (10, 1)
# model(torch.randn(10, 11)).shape
train the model¶
In [80]:
# instantiate the model
model = ANN()
# train for 200 epochs; returns the per-iteration loss log as a polars DataFrame
training_progess = trainModel(model, train_loader, test_loader, numepochs=200, learning_rate=0.01)
In [81]:
# Inspect the training progress — ending the cell with a bare expression uses
# the rich polars display instead of a plain-text print()
training_progess
shape: (6_800, 4) ┌───────────┬───────┬──────────────────┬─────────────────┐ │ iteration ┆ epoch ┆ batch_train_loss ┆ batch_test_loss │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ i64 ┆ f64 ┆ f64 │ ╞═══════════╪═══════╪══════════════════╪═════════════════╡ │ 0 ┆ 0 ┆ 8.357491 ┆ 7.801715 │ │ 1 ┆ 0 ┆ 7.378495 ┆ 7.452112 │ │ 2 ┆ 0 ┆ 6.096545 ┆ 7.126534 │ │ 3 ┆ 0 ┆ 13.573361 ┆ 6.742377 │ │ 4 ┆ 0 ┆ 9.127261 ┆ 6.389193 │ │ … ┆ … ┆ … ┆ … │ │ 6795 ┆ 199 ┆ 0.182057 ┆ 0.601537 │ │ 6796 ┆ 199 ┆ 0.159327 ┆ 0.597312 │ │ 6797 ┆ 199 ┆ 0.110114 ┆ 0.606896 │ │ 6798 ┆ 199 ┆ 0.657421 ┆ 0.616068 │ │ 6799 ┆ 199 ┆ 0.103466 ┆ 0.618086 │ └───────────┴───────┴──────────────────┴─────────────────┘
In [82]:
# groupby analysis of each epoch to have less granular data
grouped_epoch_training = (
    training_progess.group_by('epoch')
    .agg(
        # equivalent spelling: pl.mean('batch_train_loss').alias('batch_train_loss_mean')
        batch_train_loss_mean=pl.mean('batch_train_loss'),
        batch_test_loss_mean=pl.mean('batch_test_loss')
    )
    # group_by does not guarantee output row order — sort explicitly so that
    # head() and the downstream plots see epochs in ascending order
    .sort('epoch')
)
grouped_epoch_training.head()
Out[82]:
shape: (5, 3)
| epoch | batch_train_loss_mean | batch_test_loss_mean |
|---|---|---|
| i64 | f64 | f64 |
| 0 | 4.469063 | 3.823411 |
| 1 | 2.040808 | 1.728339 |
| 2 | 1.909708 | 1.631237 |
| 3 | 1.850119 | 1.554289 |
| 4 | 1.68104 | 1.477889 |
Plot the training and test performance¶
In [83]:
# mean train vs. test loss per epoch; relabel plotly's default
# 'variable'/'value' legend entries to something readable
fig = px.scatter(
    data_frame=grouped_epoch_training,
    x='epoch',
    y=['batch_train_loss_mean', 'batch_test_loss_mean'],
    labels={'variable' : 'Performance Metrics', 'value' : 'Loss'}
)
fig.update_layout(template='ggplot2', width=600)
fig.show()
In [84]:
training_progess.head()
Out[84]:
shape: (5, 4)
| iteration | epoch | batch_train_loss | batch_test_loss |
|---|---|---|---|
| i64 | i64 | f64 | f64 |
| 0 | 0 | 8.357491 | 7.801715 |
| 1 | 0 | 7.378495 | 7.452112 |
| 2 | 0 | 6.096545 | 7.126534 |
| 3 | 0 | 13.573361 | 6.742377 |
| 4 | 0 | 9.127261 | 6.389193 |
In [85]:
# per-iteration (per-batch) train vs. test loss — noisier than the epoch view above
fig = px.scatter(
    data_frame=training_progess,
    x='iteration',
    y=['batch_train_loss', 'batch_test_loss'],
    labels={'variable': 'Performance Metrics', 'value': 'Loss'}
)
fig.update_layout(template='ggplot2', width=600)
fig.show()
Compare the predictions between train and test dataset¶
In [86]:
# switch to evaluation mode before inference
model.eval()
# predictions for both splits, without tracking gradients
with torch.no_grad():
    yHatTrain = model(X_train)
    yHatTest = model(X_test)
print(yHatTest.shape)
print(yHatTrain.shape)
torch.Size([480, 1]) torch.Size([1119, 1])
In [87]:
# Build one long-format prediction table straight from the tensors:
# one frame per split, then stack them with polars' concat
prediction_frames = []
for split_name, y_true, y_pred in (
    ('Train', y_train, yHatTrain),
    ('Test', y_test, yHatTest),
):
    prediction_frames.append(pl.DataFrame({
        'True': y_true.squeeze().cpu().numpy(),
        'Predicted': y_pred.squeeze().cpu().numpy(),
        'Set': [split_name] * len(y_true)
    }))
# Train rows first, then Test — same ordering as before
df_pred = pl.concat(prediction_frames)
In [88]:
df_pred
Out[88]:
shape: (1_599, 3)
| True | Predicted | Set |
|---|---|---|
| f32 | f32 | str |
| 1.9 | 1.91981 | "Train" |
| 2.0 | 2.319143 | "Train" |
| 2.6 | 3.483783 | "Train" |
| 2.1 | 2.209713 | "Train" |
| 2.2 | 2.164838 | "Train" |
| … | … | … |
| 2.1 | 3.494391 | "Test" |
| 2.6 | 3.069432 | "Test" |
| 1.7 | 2.003307 | "Test" |
| 2.4 | 3.661964 | "Test" |
| 2.3 | 2.020392 | "Test" |
In [89]:
# predicted vs. true residual sugar; a perfect model would sit on the y = x diagonal
fig = px.scatter(
    df_pred,
    x='Predicted',
    y='True',
    color='Set',
    symbol='Set',
    title='Model Predictions vs. True Values',
    labels={'Predicted': 'Predicted Residual Sugar', 'True': 'True Residual Sugar'},
    template='ggplot2'
)
fig.update_layout(template='ggplot2', width=600)
fig.update_traces(marker=dict(line=dict(width=0.4), opacity=0.7))